#setwd('/afs/inf.ed.ac.uk/user/s17/s1725186/Documents/PhD-Models/FirstPUModel/RMarkdowns')
library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly) ; library(dendextend)
library(RColorBrewer) ; library(viridis) ; require(gridExtra) ; library(GGally)
library(WGCNA)
## ==========================================================================
## *
## * Package WGCNA 1.66 loaded.
## *
## * Important note: It appears that your system supports multi-threading,
## * but it is not enabled within WGCNA in R.
## * To allow multi-threading within WGCNA with all available cores, use
## *
## * allowWGCNAThreads()
## *
## * within R. Use disableWGCNAThreads() to disable threading if necessary.
## * Alternatively, set the following environment variable on your system:
## *
## * ALLOW_WGCNA_THREADS=<number_of_processors>
## *
## * for example
## *
## * ALLOW_WGCNA_THREADS=4
## *
## * To set the environment variable in linux bash shell, type
## *
## * export ALLOW_WGCNA_THREADS=4
## *
## * before running R. Other operating systems or shells will
## * have a similar command to achieve the same aim.
## *
## ==========================================================================
library(expss)
library(polycor)
library(knitr)
SFARI_colour_hue = function(r) {
pal = c('#FF7631','#FFB100','#E8E328','#8CC83F','#62CCA6','#59B9C9','#b3b3b3','#808080','gray','#d9d9d9')[r]
}
clustering_selected = 'DynamicHybridMergedSmall'
print(paste0('Using clustering ', clustering_selected))
## [1] "Using clustering DynamicHybridMergedSmall"
Load preprocessed dataset (preprocessing code in 19_10_14_data_preprocessing.Rmd) and clustering (pipeline in 19_10_15_WGCNA.Rmd)
# Gandal dataset
load('./../Data/Gandal/preprocessed_data.RData')
datExpr = datExpr %>% data.frame
DE_info = DE_info %>% data.frame
# GO Neuronal annotations
GO_annotations = read.csv('./../../../PhD-InitialExperiments/FirstYearReview/Data/GO_annotations/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>%
mutate('ID'=as.character(ensembl_gene_id)) %>%
dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
mutate('Neuronal'=1)
# SFARI Genes
SFARI_genes = read_csv('./../Data/SFARI/SFARI_genes_08-29-2019_with_ensembl_IDs.csv')
SFARI_genes = SFARI_genes[!duplicated(SFARI_genes$ID) & !is.na(SFARI_genes$ID),]
# Update DE_info with SFARI and Neuronal information
genes_info = DE_info %>% mutate('ID'=rownames(.)) %>% left_join(SFARI_genes, by='ID') %>%
mutate(`gene-score`=ifelse(is.na(`gene-score`), 'None', `gene-score`)) %>%
left_join(GO_neuronal, by='ID') %>%
mutate(Neuronal=ifelse(is.na(Neuronal), 0, Neuronal)) %>%
mutate(gene.score=ifelse(`gene-score`=='None' & Neuronal==1, 'Neuronal', `gene-score`),
significant=padj<0.05 & !is.na(padj))
# Dataset created with DynamicTreeMerged algorithm
dataset = read.csv(paste0('./../Data/Gandal/dataset_', clustering_selected, '.csv'))
dataset$Module = dataset[,clustering_selected]
rm(DE_info, GO_annotations, clusterings)
print(paste0('There are ', length(unique(dataset$Module)),' modules:'))
## [1] "There are 36 modules:"
dataset$Module %>% table %>% sort(decreasing=TRUE) %>% print
## .
## #53B400 #00BD64 #C17FFF #F17E4F #E76BF3 #F365E6 #C49A00 #FD6F86 #8195FF
## 1930 1555 1339 1253 1149 1086 977 942 856
## #00C0BB #4B9FFF #00C094 #00BB45 #00B70C #00BF7D #D29300 #FF61C5 gray
## 738 575 566 414 400 327 314 247 178
## #E88523 #F8766D #00A8FF #FF64B2 #75AF00 #8EAB00 #A58AFF #A3A500 #00BECD
## 176 160 131 128 125 120 114 99 97
## #D774FD #00B0F6 #00B6EB #00BBDD #FF699D #B4A000 #DE8C00 #FB61D7 #00C1A8
## 97 95 94 82 72 45 45 42 32
datTraits = datMeta %>% dplyr::select(Diagnosis_, Region, Sex, Age, PMI, RNAExtractionBatch) %>%
rename('Diagnosis' = Diagnosis_, 'ExtractionBatch' = RNAExtractionBatch)
# Recalculate MEs with color labels
ME_object = datExpr %>% t %>% moduleEigengenes(colors = dataset$Module)
MEs = orderMEs(ME_object$eigengenes)
# Calculate correlation between eigengenes and the traits and their p-values
moduleTraitCor = MEs %>% apply(2, function(x) hetcor(x, datTraits, ML=TRUE)$correlations[1,-1]) %>% t
rownames(moduleTraitCor) = colnames(MEs)
colnames(moduleTraitCor) = colnames(datTraits)
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nrow(datExpr))
# In case there are any NAs
if(sum(!complete.cases(moduleTraitCor))>0){
print(paste0(sum(is.na(moduleTraitCor)),' correlation(s) could not be calculated'))
}
## [1] "1 correlation(s) could not be calculated"
moduleTraitCor = moduleTraitCor[complete.cases(moduleTraitCor),]
moduleTraitPvalue = moduleTraitPvalue[complete.cases(moduleTraitCor),]
# Sort moduleTraitCor by Diagnosis
moduleTraitCor = moduleTraitCor[order(moduleTraitCor[,1], decreasing=TRUE),]
moduleTraitPvalue = moduleTraitPvalue[order(moduleTraitCor[,1], decreasing=TRUE),]
# Create text matrix for the Heatmap
textMatrix = paste0(signif(moduleTraitCor, 2), ' (', signif(moduleTraitPvalue, 1), ')')
dim(textMatrix) = dim(moduleTraitCor)
labeledHeatmap(Matrix = moduleTraitCor, xLabels = names(datTraits), yLabels = gsub('ME','',rownames(moduleTraitCor)),
yColorWidth=0, colors = brewer.pal(11,'PiYG'), bg.lab.y = gsub('ME','',rownames(moduleTraitCor)),
textMatrix = textMatrix, setStdMargins = FALSE, cex.text = 0.8, cex.lab.y = 0.75, zlim = c(-1,1),
main = paste('Module-Trait relationships'))
rm(ME_object, datTraits, MEs, moduleTraitCor, moduleTraitPvalue, textMatrix)
Selecting the three modules with highest (absolute) correlation to Diagnosis: #C17FFF #00C094 and #C49A00
pca = datExpr %>% prcomp
plot_data = data.frame('ID'=rownames(datExpr), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2]) %>%
left_join(dataset, by='ID') %>% dplyr::select(ID, PC1, PC2, Module, gene.score) %>%
mutate(ImportantModules = ifelse(Module=='#C17FFF', '#C17FFF',
ifelse(Module=='#00C094', '#00C094',
ifelse(Module=='#C49A00', '#C49A00', 'Others')))) %>%
mutate(color = ifelse(ImportantModules=='Others','gray',ImportantModules),
alpha = ifelse(ImportantModules=='Others', 0.1, 0.5))
ggplotly(plot_data %>% ggplot(aes(PC1, PC2, color=ImportantModules)) +
geom_point(alpha=plot_data$alpha, color=plot_data$color) + theme_minimal() +
ggtitle('Modules with strongest relation to Diagnosis'))
Modules with the strongest module-diagnosis correlation should have the highest percentage of SFARI Genes, but the plot shows this relation in modules with correlations close to 0, and thi behaviour is the same for the other clusterings I have tried…
plot_data = dataset %>% mutate('hasSFARIscore' = gene.score!='None') %>%
group_by(Module, MTcor, hasSFARIscore) %>% summarise(p=n()) %>%
left_join(dataset %>% group_by(Module) %>% summarise(n=n()), by='Module') %>%
mutate(p=round(p/n*100,2))
for(i in 1:nrow(plot_data)){
this_row = plot_data[i,]
if(this_row$hasSFARIscore==FALSE & this_row$p==100){
new_row = this_row
new_row$hasSFARIscore = TRUE
new_row$p = 0
plot_data = plot_data %>% rbind(new_row)
}
}
plot_data = plot_data %>% filter(hasSFARIscore==TRUE)
ggplotly(plot_data %>% ggplot(aes(MTcor, p, size=n)) + geom_smooth(color='gray', se=FALSE) +
geom_point(color=plot_data$Module, aes(id=Module)) +
xlab('Module-Diagnosis correlation') + ylab('% of SFARI genes') +
theme_minimal() + theme(legend.position = 'none'))
rm(i, this_row, new_row)
Gene significance close to 0 means the gene is not significant and negative values means the gene is underexpressed in Autism samples. So the important thing about gene significance is it’s absolute value
plot_data = data.frame('ID'=rownames(datExpr), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2]) %>%
left_join(dataset, by='ID')
ggplotly(plot_data %>% ggplot(aes(PC1, PC2, color=GS)) + geom_point(alpha=0.5) +
scale_color_gradientn(colours=c('#F8766D','white','#00BFC4')) +
theme_minimal() + ggtitle('Gene Significance'))
Module with the highest (absolute) correlation to Diagnosis (-0.92)
plot_data = dataset %>% dplyr::select(MM.C17FFF, GS, gene.score) %>% filter(dataset$Module=='#C17FFF')
ggplotly(plot_data %>% ggplot(aes(MM.C17FFF, GS, color=gene.score)) + geom_point(alpha=0.5) +
scale_color_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) +
theme_minimal())
Module with the second highest (absolute) correlation to Diagnosis (-0.86)
plot_data = dataset %>% dplyr::select(MM.C49A00, GS, gene.score) %>% filter(dataset$Module=='#C49A00')
ggplotly(plot_data %>% ggplot(aes(MM.C49A00, GS, color=gene.score)) + geom_point(alpha=0.5) +
scale_color_manual(values=SFARI_colour_hue(r=c(2:6,8,7))) +
theme_minimal())
Module with the third highest (absolute) correlation to Diagnosis and the top one with positive correlation (0.86)
plot_data = dataset %>% dplyr::select(MM.00C094, GS, gene.score) %>% filter(dataset$Module=='#00C094')
ggplotly(plot_data %>% ggplot(aes(MM.00C094, GS, color=gene.score)) + geom_point(alpha=0.5) +
scale_color_manual(values=SFARI_colour_hue(r=c(3:6,8,7))) +
theme_minimal())
Selecting the modules with the highest correlation to Diagnosis, and, from them, the genes with the highest module membership-(absolute) gene significance
*Ordered by \(\frac{MM+GS}{2}\)
Only one SFARI Gene in the two lists …
top_genes_1 = dataset %>% dplyr::select(ID, MM.C17FFF, GS, gene.score) %>% filter(dataset$Module=='#C17FFF') %>%
rename('MM' = 'MM.C17FFF') %>% mutate(importance = (MM+abs(GS))/2) %>% arrange(by=-importance) %>%
top_n(10)
kable(top_genes_1, caption='Top 10 genes for module #C17FFF')
| ID | MM | GS | gene.score | importance |
|---|---|---|---|---|
| ENSG00000050748 | 0.8988150 | -0.9399810 | None | 0.9193980 |
| ENSG00000108395 | 0.9029160 | -0.9250494 | None | 0.9139827 |
| ENSG00000138078 | 0.8879452 | -0.8966280 | None | 0.8922866 |
| ENSG00000177432 | 0.8546653 | -0.8915541 | None | 0.8731097 |
| ENSG00000163577 | 0.8328568 | -0.9116043 | None | 0.8722306 |
| ENSG00000176490 | 0.8489868 | -0.8936753 | None | 0.8713311 |
| ENSG00000171132 | 0.8338444 | -0.9028094 | None | 0.8683269 |
| ENSG00000128881 | 0.8698308 | -0.8654230 | None | 0.8676269 |
| ENSG00000155097 | 0.8463972 | -0.8885175 | None | 0.8674573 |
| ENSG00000196876 | 0.9445612 | -0.7816707 | 3 | 0.8631160 |
top_genes_2 = dataset %>% dplyr::select(ID, MM.C49A00, GS, gene.score) %>% filter(dataset$Module=='#C49A00') %>%
rename('MM' = 'MM.C49A00') %>% mutate(importance = (MM+abs(GS))/2) %>% arrange(by=-importance) %>%
top_n(10)
kable(top_genes_2, caption='Top 10 genes for module #C49A00')
| ID | MM | GS | gene.score | importance |
|---|---|---|---|---|
| ENSG00000147419 | 0.8141518 | -0.9461082 | None | 0.8801300 |
| ENSG00000134108 | 0.8057677 | -0.8920351 | None | 0.8489014 |
| ENSG00000134265 | 0.8401172 | -0.8478985 | None | 0.8440079 |
| ENSG00000165169 | 0.8519740 | -0.8173056 | None | 0.8346398 |
| ENSG00000150768 | 0.8124113 | -0.8412765 | None | 0.8268439 |
| ENSG00000134440 | 0.7982305 | -0.8361057 | None | 0.8171681 |
| ENSG00000169139 | 0.8424449 | -0.7718000 | None | 0.8071224 |
| ENSG00000138138 | 0.7972805 | -0.7841764 | None | 0.7907285 |
| ENSG00000156467 | 0.8373351 | -0.7420724 | None | 0.7897037 |
| ENSG00000075303 | 0.7775604 | -0.7943756 | None | 0.7859680 |
top_genes_3 = dataset %>% dplyr::select(ID, MM.00C094, GS, gene.score) %>% filter(dataset$Module=='#00C094') %>%
rename('MM' = 'MM.00C094') %>% mutate(importance = (MM+abs(GS))/2) %>% arrange(by=-importance) %>%
top_n(10)
kable(top_genes_3, caption='Top 10 genes for module #00C094')
| ID | MM | GS | gene.score | importance |
|---|---|---|---|---|
| ENSG00000026025 | 0.7987985 | 0.8872398 | None | 0.8430191 |
| ENSG00000174348 | 0.7732747 | 0.8656890 | None | 0.8194819 |
| ENSG00000173068 | 0.8396254 | 0.7969568 | None | 0.8182911 |
| ENSG00000197321 | 0.9062201 | 0.7232536 | None | 0.8147369 |
| ENSG00000142173 | 0.8963275 | 0.7327475 | None | 0.8145375 |
| ENSG00000164692 | 0.9168698 | 0.7006483 | None | 0.8087590 |
| ENSG00000108821 | 0.9018778 | 0.7113022 | None | 0.8065900 |
| ENSG00000172296 | 0.7790732 | 0.8111975 | None | 0.7951354 |
| ENSG00000096696 | 0.9085424 | 0.6529530 | None | 0.7807477 |
| ENSG00000115461 | 0.8306216 | 0.7187476 | None | 0.7746846 |
plot_data = data.frame('ID'=rownames(datExpr), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2]) %>%
left_join(dataset, by='ID') %>% dplyr::select(ID, PC1, PC2, Module, gene.score) %>%
mutate(color = ifelse(Module=='#C17FFF', '#C17FFF',
ifelse(Module=='#00C094', '#00C094',
ifelse(Module=='#C49A00', '#C49A00', 'gray')))) %>%
mutate(alpha = ifelse(color %in% c('#C17FFF', '#00C094', '#C49A00') &
ID %in% c(as.character(top_genes_1$ID),
as.character(top_genes_2$ID),
as.character(top_genes_3$ID)), 1, 0.1))
plot_data %>% ggplot(aes(PC1, PC2)) + geom_point(alpha=plot_data$alpha, color=plot_data$color) +
theme_minimal() + ggtitle('Important genes identified through WGCNA')